import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
df = pd.read_csv('news.csv')
df.shape
df.head()
df.info()
(6335, 4)
 | Unnamed: 0 | title | text | label |
---|---|---|---|---|
0 | 8476 | You Can Smell Hillary’s Fear | Daniel Greenfield, a Shillman Journalism Fello... | FAKE |
1 | 10294 | Watch The Exact Moment Paul Ryan Committed Pol... | Google Pinterest Digg Linkedin Reddit Stumbleu... | FAKE |
2 | 3608 | Kerry to go to Paris in gesture of sympathy | U.S. Secretary of State John F. Kerry said Mon... | REAL |
3 | 10142 | Bernie supporters on Twitter erupt in anger ag... | — Kaydee King (@KaydeeKing) November 9, 2016 T... | FAKE |
4 | 875 | The Battle of New York: Why This Primary Matters | It's primary day in New York and front-runners... | REAL |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Unnamed: 0  6335 non-null   int64
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB
y = df.label
y = y.map({'FAKE': 1, 'REAL': 0}).astype(int)
X = df['text']
y
0       1
1       1
2       0
3       1
4       0
       ..
6330    0
6331    1
6332    1
6333    0
6334    0
Name: label, Length: 6335, dtype: int32
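Before splitting, it’s worth a quick check that the two classes are roughly balanced (they are in this dataset), since that is what justifies using plain accuracy as the scoring metric later:

```python
# Class distribution; with roughly balanced classes, accuracy is a reasonable metric.
y.value_counts()
```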
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
Stop words are the most common words in a language, and they are filtered out before the text is processed; a TfidfVectorizer turns a collection of raw documents into a matrix of TF-IDF features. Let’s initialize the vectorizer with English stop words and a maximum document frequency (max_df) of 0.7, so that terms appearing in more than 70% of the documents are discarded as uninformative; the grid search below also tries max_df=0.5.
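To make these two settings concrete, here is a tiny standalone sketch (a toy corpus, not the news data) showing how stop words and max_df prune the vocabulary:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus (illustrative only, not the news dataset).
toy_docs = [
    "the economy is growing fast",
    "the economy is shrinking fast",
    "voters watch the debate",
]

# 'the' and 'is' are English stop words; max_df=0.5 drops any term appearing
# in more than half of the documents ('economy' and 'fast' occur in 2/3).
vec = TfidfVectorizer(stop_words='english', max_df=0.5)
tfidf = vec.fit_transform(toy_docs)
print(vec.get_feature_names_out())  # ['debate' 'growing' 'shrinking' 'voters' 'watch']
print(tfidf.shape)                  # (3, 5)
```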
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
pipe = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                 ('classifier', RandomForestClassifier(random_state=42))])
param_grid = [
{'classifier': [RandomForestClassifier(random_state=42)],
'tfidf': [TfidfVectorizer(stop_words='english')]},
{'classifier': [PassiveAggressiveClassifier(max_iter=50)],
'tfidf': [TfidfVectorizer(stop_words='english')],
'tfidf__max_df': (0.5, 0.7),
'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]}
]
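A note on how this list-of-dicts grid expands: the first dict contributes a single candidate (all default TF-IDF settings), while the second contributes 2 max_df values × 3 ngram_range values = 6, for 7 candidates in total, hence the 35 fits at 5 folds each in the log below. A quick way to verify the count is sklearn’s ParameterGrid, which expands the same syntax GridSearchCV uses:

```python
from sklearn.model_selection import ParameterGrid

# 1 candidate from the first dict + 2 * 3 = 6 from the second.
print(len(list(ParameterGrid(param_grid))))  # 7
```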
grid_search_tune = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', verbose=3)
grid_search_tune.fit(X_train, y_train)
best_model = grid_search_tune.best_estimator_
print("Best cross-validation score: {:.2f}".format(grid_search_tune.best_score_))
print("Best params:\n{}\n".format(grid_search_tune.best_params_))
print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)
Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END classifier=RandomForestClassifier(random_state=42), tfidf=TfidfVectorizer(stop_words='english'); total time= 12.7s
[CV 2/5] END classifier=RandomForestClassifier(random_state=42), tfidf=TfidfVectorizer(stop_words='english'); total time= 12.9s
[CV 3/5] END classifier=RandomForestClassifier(random_state=42), tfidf=TfidfVectorizer(stop_words='english'); total time= 12.2s
[CV 4/5] END classifier=RandomForestClassifier(random_state=42), tfidf=TfidfVectorizer(stop_words='english'); total time= 13.1s
[CV 5/5] END classifier=RandomForestClassifier(random_state=42), tfidf=TfidfVectorizer(stop_words='english'); total time= 11.9s
[CV 1/5] END classifier=PassiveAggressiveClassifier(max_iter=50), tfidf=TfidfVectorizer(stop_words='english'), tfidf__max_df=0.5, tfidf__ngram_range=(1, 1); total time= 5.6s
...
[CV 5/5] END classifier=PassiveAggressiveClassifier(max_iter=50), tfidf=TfidfVectorizer(stop_words='english'), tfidf__max_df=0.7, tfidf__ngram_range=(1, 3); total time= 24.4s
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                                       ('classifier', RandomForestClassifier(random_state=42))]),
             param_grid=[{'classifier': [RandomForestClassifier(random_state=42)],
                          'tfidf': [TfidfVectorizer(stop_words='english')]},
                         {'classifier': [PassiveAggressiveClassifier(max_iter=50)],
                          'tfidf': [TfidfVectorizer(max_df=0.7, ngram_range=(1, 2), stop_words='english')],
                          'tfidf__max_df': (0.5, 0.7),
                          'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]}],
             scoring='accuracy', verbose=3)
Best cross-validation score: 0.94
Best params:
{'classifier': PassiveAggressiveClassifier(max_iter=50), 'tfidf': TfidfVectorizer(max_df=0.7, ngram_range=(1, 2), stop_words='english'), 'tfidf__max_df': 0.7, 'tfidf__ngram_range': (1, 2)}

Best parameters set:
[('tfidf', TfidfVectorizer(max_df=0.7, ngram_range=(1, 2), stop_words='english')), ('classifier', PassiveAggressiveClassifier(max_iter=50))]
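Note that with the default refit=True, GridSearchCV refits the winning configuration on the entire training set once the search finishes, so best_estimator_ can be used for prediction directly, without any additional fitting.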
scores_best_models = pd.DataFrame(grid_search_tune.cv_results_)[['param_classifier','mean_test_score','std_test_score','rank_test_score']]
scores_best_models.sort_values(by='rank_test_score')
 | param_classifier | mean_test_score | std_test_score | rank_test_score |
---|---|---|---|---|
5 | PassiveAggressiveClassifier(max_iter=50) | 0.937978 | 0.008447 | 1 |
2 | PassiveAggressiveClassifier(max_iter=50) | 0.937303 | 0.010896 | 2 |
4 | PassiveAggressiveClassifier(max_iter=50) | 0.934370 | 0.008239 | 3 |
1 | PassiveAggressiveClassifier(max_iter=50) | 0.933920 | 0.006934 | 4 |
6 | PassiveAggressiveClassifier(max_iter=50) | 0.929634 | 0.011469 | 5 |
3 | PassiveAggressiveClassifier(max_iter=50) | 0.929183 | 0.011963 | 6 |
0 | RandomForestClassifier(random_state=42) | 0.894448 | 0.015337 | 7 |
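Every PassiveAggressiveClassifier configuration outperforms the random forest baseline by roughly four points of cross-validated accuracy, while the six TF-IDF settings differ from one another by less than a point.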
y_pred = best_model.predict(X_test)
score = accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')
Accuracy: 93.79%
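Since the whole pipeline (vectorizer plus classifier) lives in best_model, it can be persisted and reloaded as a single object. A minimal sketch with joblib; the filename here is an arbitrary choice, not something from the notebook:

```python
import joblib

# Persist the fitted pipeline (TF-IDF vectorizer + classifier) as one object.
joblib.dump(best_model, 'fake_news_model.joblib')

# Reload and predict on raw text; the pipeline handles vectorization itself.
loaded = joblib.load('fake_news_model.joblib')
print(loaded.predict(["Some article text to classify..."]))  # array([0]) or array([1])
```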
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
InteractiveShell.ast_node_interactivity = "last_expr"
cf_matrix = confusion_matrix(y_test, y_pred)
# Cell labels: names, counts, and percentages for the heatmap annotations
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
# Join the above into text
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
categories = ['REAL (0)','FAKE (1)']
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues', xticklabels=categories, yticklabels=categories)
plt.title('Best Model')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.92      0.95      0.94       933
           1       0.95      0.92      0.94       968

    accuracy                           0.94      1901
   macro avg       0.94      0.94      0.94      1901
weighted avg       0.94      0.94      0.94      1901
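As a sanity check, the class-1 (FAKE) precision and recall in the report can be derived directly from the confusion matrix computed above:

```python
# With label order [0, 1], confusion_matrix returns [[TN, FP], [FN, TP]]
# when FAKE (1) is treated as the positive class.
tn, fp, fn, tp = cf_matrix.ravel()
print(f"precision (FAKE): {tp / (tp + fp):.2f}")  # the 0.95 in the report
print(f"recall    (FAKE): {tp / (tp + fn):.2f}")  # the 0.92 in the report
```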